# install.packages("renv")
#
# renv::init()
#
# install.packages("tidyverse")
# install.packages("ggplot2")
# install.packages("cowplot")
# install.packages("sf")
# install.packages("rnaturalearth")
# install.packages("rnaturalearthdata")
# install.packages("slendr")
# install.packages("devtools")
# install.packages("magick")
# install.packages("gifski")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(sf)
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(rnaturalearth)
library(rnaturalearthdata)
##
## Attaching package: 'rnaturalearthdata'
##
## The following object is masked from 'package:rnaturalearth':
##
## countries110
library(gganimate)
library(slendr)
## =======================================================================
## NOTE: Due to Python setup issues on some systems which have been
## causing trouble particularly for novice users, calling library(slendr)
## no longer activates slendr's Python environment automatically.
##
## In order to use slendr's msprime back end or its tree-sequence
## functionality, users must now activate slendr's Python environment
## manually by executing init_env() after calling library(slendr).
##
## (This note will be removed in the next major version of slendr.)
## =======================================================================
#slendr::setup_env()
slendr::init_env()
## The interface to all required Python modules has been activated.
simulation_path = "../../../data/europe"
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
colors = gg_color_hue(6)
col_pop = c("AFR" = colors[1], "OOA" = colors[2], "EHG" = colors[3],
"ANA" = colors[4], "EUR" = colors[5], "YAM" = colors[6])
sampling_uniform_intime <- function(n, pop_timing, pops_list, gen_time = 30, min = 500, max = 51000){
samples <- data.frame()
i = 0
while(samples %>% nrow() < 1 || sum(samples$n) < n){
t = round(runif(n = 1, min = min, max = max))
if(pop_timing %>% dplyr::filter(start > t+gen_time, end < t-gen_time) %>% nrow() > 0){
i = i + 1
pop_idx = pop_timing %>%
dplyr::filter(start > t+gen_time, end < t-gen_time) %>%
dplyr::sample_n(1) %>%
dplyr::pull(index)
pop_nam = pop_timing %>% dplyr::filter(index == pop_idx) %>% dplyr::pull(pop)
if(samples %>% nrow() > 1 && samples %>% dplyr::filter(time == t, pop == pop_nam) %>% nrow() > 0){
samples <- samples %>% dplyr::mutate(n = ifelse(time == t & pop == pop_nam, n+1, n))
}else{
samples <- rbind(samples, schedule_sampling(model, times = t, list( pops_list[[pop_idx]], 1 )))
}
}
}
return(samples)
}
map <- world(xrange = c(-13, 70), yrange = c(18, 65),crs = "EPSG:3035")
print(map)
## slendr 'map' object
## -------------------
## map: internal coordinate reference system EPSG 3035
## spatial limits (in degrees longitude and latitude):
## - vertical -13 ... 70
## - horizontal 18 ... 65
plot_map(map)
africa <- region(
"Africa", map,
polygon = list(c(-18, 20), c(38, 20), c(30, 33),
c(20, 33), c(10, 38), c(-6, 35))
)
europe <- region(
"Europe", map,
polygon = list(
c(-8, 35), c(-5, 36), c(10, 38), c(20, 35), c(25, 35),
c(33, 45), c(20, 58), c(-5, 60), c(-15, 50)
)
)
anatolia <- region(
"Anatolia", map,
polygon = list(c(28, 35), c(40, 35), c(42, 40),
c(30, 43), c(27, 40), c(25, 38))
)
plot_map(africa, europe, anatolia)
# African ancestral population
# ----------------------------
afr <- population("AFR", time = 52000, N = 3000, map = map, polygon = africa)
# population of the first migrants out of Africa
# ----------------------------------------------
ooa <- population("OOA", parent = afr, time = 51000, N = 500, remove = 25000, center = c(33, 30), radius = 400e3) %>%
move(trajectory = list(c(40, 30), c(50, 30), c(60, 40)), start = 50000, end = 40000, snapshots = 20)
# Eastern hunter-gatherers
# ------------------------
ehg <- population("EHG", parent = ooa, time = 28000, N = 1000, remove = 6000, polygon = list(c(26, 55), c(38, 53), c(48, 53), c(60, 53), c(60, 60), c(48, 63), c(38, 63), c(26, 60)))
# European population
# -------------------
eur <- population( name = "EUR", parent = ehg, time = 25000, N = 2000, polygon = europe)
# Anatolian farmers
# -----------------
ana <- population( name = "ANA", time = 28000, N = 3000, parent = ooa, remove = 4000, center = c(34, 38), radius = 500e3, polygon = anatolia) %>%
expand_range(by = 2500e3, start = 10000, end = 7000, polygon = join(europe, anatolia), snapshots = 20)
# Yamnaya steppe population
# -------------------------
yam <- population( name = "YAM", time = 7000, N = 500, parent = ehg, remove = 2500, polygon = list(c(26, 50), c(38, 49), c(48, 50), c(48, 56), c(38, 59), c(26, 56))) %>%
move(trajectory = list(c(15, 50)), start = 5000, end = 3000, snapshots = 10)
# Gene flow events
# ----------------
gf <- list(
gene_flow(from = ana, to = yam, rate = 0.5, start = 6500, end = 6400, overlap = FALSE),
gene_flow(from = ana, to = eur, rate = 0.5, start = 8000, end = 6000),
gene_flow(from = yam, to = eur, rate = 0.75, start = 4000, end = 3000)
)
plot_map(afr, ooa, ehg, eur, ana, yam)
if(file.exists(paste(simulation_path, "/populations.tsv", sep = "" ))){
model <- read_model(simulation_path)
}else{
model <- compile_model(
populations = list(afr, ooa, ehg, eur, ana, yam),
gene_flow = gf,
generation_time = 30,
resolution = 10e3, # resolution in meters per pixel
competition = 130e3, mating = 100e3, # spatial interaction in SLiM
dispersal = 70e3, # how far will offspring end up from their parents
path = simulation_path,
overwrite = TRUE
)
}
model
## slendr 'model' object
## ---------------------
## populations: AFR, OOA, EHG, ANA, EUR, YAM
## geneflow events: 3
## generation time: 30
## time direction: backward
## total running length: 52000 model time units
## model type: spatial
## - number of spatial maps: 60
## - resolution: 10000 distance units per pixel
##
## configuration files in: /Users/moisescollmacia/Documents/spaceNNtime/data/europe
This next cell opens a shiny app to explore the model further
#explore_model(model)
plot_grid(
plot_map(afr, ooa, ehg, eur, ana, yam),
plot_model(model) + ggplot2::theme(legend.position = "none"))
I’ve made my own function to sample uniform in time but I’m using a
slendr function under the hood. This function takes lots of
time to run.
set.seed(1234)
pops_list <- list(afr, ooa, ehg, eur, ana, yam)
pop_timing <- data.frame(pop = c("AFR", "OOA", "EHG", "EUR", "ANA", "YAM"),
start = c(52000, 51000, 28000, 25000, 28000, 7000),
end = c( 0, 25000, 6000, 0, 4000, 2500),
index = c( 1, 2, 3, 4, 5, 6)) %>%
dplyr::filter(pop != "AFR")
if(! file.exists(paste(simulation_path, "/input_slim_sampling.tsv", sep = ""))){
print("File does not exist, creating sampling dataframe")
samples <- sampling_uniform_intime(15000, pop_timing, pops_list)
write.table(samples, file = paste(simulation_path, "/input_slim_sampling.tsv", sep = ""), quote=FALSE, col.names=TRUE)
}else{
print("File exist, reading...")
samples <- read.table(paste(simulation_path, "/input_slim_sampling.tsv", sep = ""), header = T)
}
## [1] "File exist, reading..."
head(samples)
## time pop n y_orig x_orig y x
## 1 6242 YAM 1 NA NA NA NA
## 2 31268 OOA 1 NA NA NA NA
## 3 43976 OOA 1 NA NA NA NA
## 4 980 EUR 1 NA NA NA NA
## 5 34137 OOA 1 NA NA NA NA
## 6 35526 OOA 1 NA NA NA NA
samples %>%
dplyr::summarize(n = sum(n)) %>%
pull(n)
## [1] 15000
samples %>% summary()
## time pop n y_orig x_orig
## Min. : 502 Length:13639 Min. :1.0 Mode:logical Mode:logical
## 1st Qu.:12884 Class :character 1st Qu.:1.0 NA's:13639 NA's:13639
## Median :24883 Mode :character Median :1.0
## Mean :25308 Mean :1.1
## 3rd Qu.:37718 3rd Qu.:1.0
## Max. :50959 Max. :4.0
## y x
## Mode:logical Mode:logical
## NA's:13639 NA's:13639
##
##
##
##
samples %>%
group_by(pop) %>%
summarize(max_time = max(time), min_time = min(time))
## # A tibble: 5 × 3
## pop max_time min_time
## <chr> <int> <int>
## 1 ANA 27963 4036
## 2 EHG 27966 6034
## 3 EUR 24962 502
## 4 OOA 50959 25036
## 5 YAM 6944 2543
read.table(file.path(simulation_path, "populations.tsv"), header = TRUE)
## pop parent N tsplit_gen tsplit_orig tremove_gen tremove_orig
## 1 AFR __pop_is_ancestor 3000 1 52000 -1 -1
## 2 OOA AFR 500 34 51000 901 25000
## 3 EHG OOA 1000 801 28000 1534 6000
## 4 ANA OOA 3000 801 28000 1601 4000
## 5 EUR EHG 2000 901 25000 -1 -1
## 6 YAM EHG 500 1501 7000 1651 2500
## pop_id parent_id
## 1 0 -1
## 2 1 0
## 3 2 1
## 4 3 1
## 5 4 2
## 6 5 2
So, from the dataframe above, we extract the following dictionary
ids_pop = c("0" = "AFR",
"1" = "OOA",
"2" = "EHG",
"3" = "ANA",
"4" = "EUR",
"5" = "YAM")
ids_pop
## 0 1 2 3 4 5
## "AFR" "OOA" "EHG" "ANA" "EUR" "YAM"
plot_grid(
samples %>%
tidyr::uncount(n) %>%
dplyr::mutate(pop = factor(pop, levels = c("OOA", "EHG", "ANA", "EUR", "YAM"))) %>%
ggplot() +
geom_jitter(aes(x = pop, y = time/1000, color = pop)) +
scale_color_manual(values=col_pop) +
xlab("") +
ylab("Time (years x1000)") +
theme_light() +
theme(legend.position = "none"),
samples %>%
tidyr::uncount(n) %>%
ggplot() +
geom_histogram(aes(y = time, fill = pop), bins = 30) +
theme_light() +
scale_fill_manual(values=col_pop) +
theme(axis.ticks.y = element_blank(), axis.title.y = element_blank(), axis.text.y= element_blank(), legend.position = "none"),
rel_widths= c(2, 1))
if(file.exists(paste(simulation_path, "/output_slim.trees", sep = ""))){
print("Simulations already done!")
}else{
slim(
model = model,
sequence_length = 1e6,
recombination_rate = 1e-8, # simulate only a single locus
samples = samples,
output = paste(simulation_path, "/output_slim.trees", sep = ""),
method = "batch", # change to "gui" to execute the model in SLiMgui
random_seed = 1234,
verbose = TRUE,
load = FALSE,
locations = paste(simulation_path, "/output_locations.txt", sep = "") # save the location of everyone who ever lived
)
}
## [1] "Simulations already done!"
if(!file.exists(paste(simulation_path, "/sim.gif", sep = ""))){
animate_model(model = model,
gif = paste(simulation_path, "/sim.gif", sep = ""),
file = paste(simulation_path, "/output_locations.txt.gz", sep = ""),
steps = 100,
width = 500,
height = 300)
}
rec_rate <- 1e-8
mut_rate <- 1e-8
ne <- 3000
ran_seed <- 1234
model <- read_model(simulation_path)
if(!file.exists(paste(simulation_path, "/tree.trees", sep = ""))){
print(paste("The tree file ", simulation_path, "/tree.trees does not exist... Generating it...", sep = ""))
ts <- ts_load(file = paste(simulation_path, "/output_slim.trees", sep = ""),
model = model,
recapitate = TRUE,
simplify = TRUE,
mutate = TRUE,
recombination_rate = rec_rate,
mutation_rate = 1e-8,
Ne = ne,
random_seed = ran_seed)
ts_save(ts, file = paste(simulation_path, "/tree.trees", sep = ""))
}else{
print(paste("The tree file ", simulation_path, "/tree.trees already exist... Loading it...", sep = ""))
ts <- ts_load(file = paste(simulation_path, "/tree.trees", sep = ""))
}
## [1] "The tree file ../../../data/europe/tree.trees already exist... Loading it..."
if(!file.exists(paste(simulation_path, "/metadata.txt", sep = ""))){
print(paste("The data file ", simulation_path, "/metadata.txt does not exist... Generating it...", sep = ""))
data <- ts_nodes(ts)
data.frame(ind_id = data$ind_id,
pedigree_id = data$pedigree_id,
pop_id = data$pop_id,
node_id = data$node_id,
location = data$location,
time = data$time,
sampled = data$sampled) %>%
filter(sampled) %>%
rowwise() %>%
mutate(lat = st_transform(geometry, 4326)[[1]][2],
lon = st_transform(geometry, 4326)[[1]][1]) %>%
select(-c(geometry, pedigree_id)) %>%
group_by(ind_id, pop_id, time, sampled, lat, lon) %>%
mutate(n = 1, n = cumsum(n), n = paste("node", n, sep = "")) %>%
spread(n, node_id) %>%
ungroup() %>%
mutate(pop = ids_pop[as.character(pop_id)]) %>%
write.table(., file = paste(simulation_path, "/metadata.txt", sep = ""),
append = FALSE,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE)
}else{
print(paste("The tree file ", simulation_path, "/tree.trees already exist...", sep = ""))
}
## [1] "The tree file ../../../data/europe/tree.trees already exist..."
metadata <- read.table(paste(simulation_path, "/metadata.txt", sep = ""), header = T)
metadata %>% head()
## ind_id pop_id time sampled lat lon node1 node2 pop
## 1 72 1 50959 TRUE 27.10800 31.31279 0 1 OOA
## 2 73 1 50957 TRUE 30.03180 32.25275 2 3 OOA
## 3 74 1 50948 TRUE 30.32478 34.35868 4 5 OOA
## 4 75 1 50946 TRUE 30.11166 34.13403 6 7 OOA
## 5 76 1 50941 TRUE 27.81365 33.82782 8 9 OOA
## 6 77 1 50941 TRUE 30.69545 36.59060 10 11 OOA
world <- ne_countries(scale = "medium", returnclass = "sf")
metadata %>%
sample_frac(size = 0.1) %>%
mutate(time_bin = trunc(time/5000),
time_frame = time%%5000) %>%
ggplot() +
geom_sf(data = world) +
coord_sf(xlim = c(-25, 70), ylim = c(20, 80), expand = FALSE) +
geom_point(aes(x = lon, y = lat, color = pop, alpha = time_frame)) +
facet_grid(pop~time_bin) +
theme_bw() +
scale_color_manual(values=col_pop) +
xlab("Longitude") +
ylab("Latitude")